1 Descriptive stats

describe(a3)

1.1 Dependent variable

hist(a$Score)

hist(a$nwords)

1.2 Independent variable

hist(a$b_concreteness)

hist(a$mcd)

hist(a$usf)

cor.test(a$Score,a$b_concreteness)
## 
##  Pearson's product-moment correlation
## 
## data:  a$Score and a$b_concreteness
## t = -0.17945, df = 478, p-value = 0.8577
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09763610  0.08135285
## sample estimates:
##          cor 
## -0.008207364
cor.test(a$Score,a$mcd)
## 
##  Pearson's product-moment correlation
## 
## data:  a$Score and a$mcd
## t = 8.5114, df = 478, p-value = 2.236e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2824510 0.4380577
## sample estimates:
##       cor 
## 0.3627806
cor.test(a$Score,a$usf)
## 
##  Pearson's product-moment correlation
## 
## data:  a$Score and a$usf
## t = -7.7064, df = 478, p-value = 7.542e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4097460 -0.2503862
## sample estimates:
##        cor 
## -0.3324368

1.2.1 nsubjects

hist(a$v_nsubj_deltap_govcue)

hist(a$v_nsubj_deltap_depcue)

hist(a$v_nsubj_deltap_strgst)

hist(a$v_nsubj_MI)

hist(a$v_nsubj_T)

1.2.2 Dobj histograms

hist(a$v_dobj_deltap_govcue) #heavily skewed (right tail)

hist(a$v_dobj_deltap_depcue)

hist(a$v_dobj_deltap_strgst) #looks the nicest

hist(a$v_dobj_MI)

hist(a$v_dobj_T)

1.2.3 Adv mod

hist(a$v_advmod_deltap_govcue)

hist(a$v_advmod_deltap_depcue)

hist(a$v_advmod_deltap_strgst)

hist(a$v_advmod_MI)

hist(a$v_advmod_T)

1.2.4 Amod

hist(a$n_amod_deltap_govcue,breaks = 50) # some outliers

hist(a$n_amod_deltap_depcue,breaks = 50)

hist(a$n_amod_deltap_strgst,breaks = 50)

hist(a$n_amod_MI,breaks = 50)

hist(a$n_amod_T,breaks = 50)

1.3 Picking the variable based on the normally distributed indices

refined normal indices using python script…

varnames <- rbind("McD CD", "USF CD", "Verb—Nsubject DeltaP Dep", "Verb—Dobj (MI)", "Verb—Advmod Delta P Strongest", "Noun—Amod (MI)", "Adjective Frequency (Logged)", "Adverb Frequency (Logged)", "Main verb Frequency (Logged)", "Content word lemma Frequency", "Lemma bigram DeltaP Strongest")

IndexCategory <- rbind("Contextual distinctiveness", "Contextual distinctiveness", "Dependency bigram", "Dependency bigram", "Dependency bigram", "Dependency bigram", "Word Frequency", "Word Frequency", "Word Frequency", "Word Frequency", "Bigram")
#desc <- describe(a2[,2:12])
#desc$variable <- varnames
#write.csv(desc, "descriptives.csv")
#corr <- as.data.frame(cor(a2[,2:12]))
##corr$variable <- varnames
#write.csv(corr, "correlations.csv")

2 Correlation with the dependent variable

correlation <- corr.test(y = a2$Score, x = a2[,2:12], method = "pearson")

cor_result2 <- as.data.frame(cbind(varnames, IndexCategory,correlation$r, correlation$se))

cor_result <- cbind(varnames, IndexCategory,print(correlation, short = F, digit = 3))
## Call:corr.test(x = a2[, 2:12], y = a2$Score, method = "pearson")
## Correlation matrix 
##                          [,1]
## mcd                     0.363
## usf                    -0.332
## v_nsubj_deltap_depcue   0.133
## v_dobj_MI               0.224
## v_advmod_deltap_strgst  0.261
## n_amod_MI               0.254
## amod_freq_log          -0.208
## adv_manner_freq_log     0.134
## lex_mverb_freq_log     -0.129
## cw_lemma_freq_log      -0.193
## lemm_bg_deltap_strgst   0.174
## Sample Size 
## [1] 480
## Probability values  adjusted for multiple tests. 
##                         [,1]
## mcd                    0.000
## usf                    0.000
## v_nsubj_deltap_depcue  0.010
## v_dobj_MI              0.000
## v_advmod_deltap_strgst 0.000
## n_amod_MI              0.000
## amod_freq_log          0.000
## adv_manner_freq_log    0.010
## lex_mverb_freq_log     0.010
## cw_lemma_freq_log      0.000
## lemm_bg_deltap_strgst  0.001
## 
##  Confidence intervals based upon normal theory.  To get bootstrapped values, try cor.ci
##          raw.lower  raw.r raw.upper raw.p lower.adj upper.adj
## mcd-NA       0.282  0.363     0.438 0.000     0.245     0.470
## usf-NA      -0.410 -0.332    -0.250 0.000    -0.441    -0.214
## v_n__-NA     0.044  0.133     0.220 0.010     0.032     0.233
## v__MI-NA     0.137  0.224     0.307 0.000     0.104     0.337
## v_d__-NA     0.176  0.261     0.342 0.000     0.139     0.375
## n__MI-NA     0.168  0.254     0.336 0.000     0.133     0.367
## amd__-NA    -0.292 -0.208    -0.121 0.000    -0.320    -0.090
## ad___-NA     0.045  0.134     0.221 0.010     0.025     0.239
## lx___-NA    -0.216 -0.129    -0.040 0.010    -0.216    -0.040
## cw___-NA    -0.277 -0.193    -0.105 0.000    -0.303    -0.077
## lm___-NA     0.086  0.174     0.259 0.001     0.061     0.282
cor_result$cor_abs <- abs(cor_result$raw.r)
ggplot(cor_result, aes(x = reorder(varnames, cor_abs), y = raw.r, family = "serif", fill = IndexCategory, shape = IndexCategory), ymax = .5, ymin = -.5)+
  geom_bar( stat = "identity", width = .7) +
  geom_pointrange(aes(x = reorder(varnames, cor_abs), y =  raw.r, ymin = raw.lower, ymax = raw.upper, width = .15)) + 
  ylim(-1, 1) +
  geom_text(aes(y = -.79, label = raw.r, size = 1.7), hjust = "outward", family = "serif") +
  coord_flip() +
  theme_bw()+
  labs(x = "Lexical and phraseological indices", y = "Pearson Correlation Coefficients") +
  theme(legend.position="bottom") +
  scale_size(guide = 'none')
## Warning: Ignoring unknown aesthetics: width

3 Regression modeling

3.1 Constructing a full model

#create model, use dredge for all subsets.
full_mod <- lm(Score ~ .,data = a3)
summary(full_mod)
## 
## Call:
## lm(formula = Score ~ ., data = a3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.51442 -0.55351  0.00617  0.53979  1.98805 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3.427083   0.034822  98.416  < 2e-16 ***
## mcd                     2.052053   0.476389   4.308 2.01e-05 ***
## usf                    -0.018305   0.005345  -3.425  0.00067 ***
## v_nsubj_deltap_depcue  -0.104989   1.032914  -0.102  0.91908    
## v_dobj_MI               0.298912   0.069814   4.282 2.25e-05 ***
## v_advmod_deltap_strgst  7.409734   1.687236   4.392 1.39e-05 ***
## n_amod_MI               0.180960   0.056137   3.224  0.00135 ** 
## amod_freq_log          -0.122507   0.049777  -2.461  0.01421 *  
## adv_manner_freq_log     0.025192   0.012963   1.943  0.05257 .  
## lex_mverb_freq_log      0.231498   0.108740   2.129  0.03378 *  
## cw_lemma_freq_log       0.267065   0.180847   1.477  0.14042    
## lemm_bg_deltap_strgst   1.154843   2.436164   0.474  0.63569    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7629 on 468 degrees of freedom
## Multiple R-squared:  0.2774, Adjusted R-squared:  0.2604 
## F-statistic: 16.33 on 11 and 468 DF,  p-value: < 2.2e-16
options(na.action = "na.fail")
drge <- dredge(full_mod)
## Fixed term is "(Intercept)"

3.2 Model selection through best-subset regression

sel_models_aic <- model.sel(drge)
sel_models_aic[1:10]

3.3 Model selection using BIC

sel_models_bic <- model.sel(drge, rank = BIC)
## New rank 'BIC' applied to logLik objects
sel_models_bic[1:10]
best_mod_aic <- lm(Score ~ adv_manner_freq_log+amod_freq_log+cw_lemma_freq_log + a2$lex_mverb_freq_log +mcd + n_amod_MI+ usf + v_advmod_deltap_strgst+v_dobj_MI, data = a3)
summary(best_mod_aic)
## 
## Call:
## lm(formula = Score ~ adv_manner_freq_log + amod_freq_log + cw_lemma_freq_log + 
##     a2$lex_mverb_freq_log + mcd + n_amod_MI + usf + v_advmod_deltap_strgst + 
##     v_dobj_MI, data = a3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.52236 -0.53524  0.00208  0.54040  1.98528 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.022047   1.149126   0.889 0.374236    
## adv_manner_freq_log     0.025578   0.012907   1.982 0.048097 *  
## amod_freq_log          -0.123154   0.049557  -2.485 0.013299 *  
## cw_lemma_freq_log       0.280646   0.178012   1.577 0.115569    
## a2$lex_mverb_freq_log   0.225877   0.107875   2.094 0.036805 *  
## mcd                     2.093641   0.453211   4.620 4.97e-06 ***
## n_amod_MI               0.180348   0.055818   3.231 0.001320 ** 
## usf                    -0.018370   0.005305  -3.463 0.000584 ***
## v_advmod_deltap_strgst  7.505786   1.664943   4.508 8.27e-06 ***
## v_dobj_MI               0.300919   0.069547   4.327 1.85e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7615 on 470 degrees of freedom
## Multiple R-squared:  0.277,  Adjusted R-squared:  0.2632 
## F-statistic: 20.01 on 9 and 470 DF,  p-value: < 2.2e-16

3.4 Best regression model 1

  • supression for the lexical frequency
  • removed from the model
best_mod_bic <- lm(Score ~ lex_mverb_freq_log +mcd + n_amod_MI+ usf + v_advmod_deltap_strgst+v_dobj_MI, data = a3)
summary(best_mod_bic)
## 
## Call:
## lm(formula = Score ~ lex_mverb_freq_log + mcd + n_amod_MI + usf + 
##     v_advmod_deltap_strgst + v_dobj_MI, data = a3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.10796 -0.51796  0.01449  0.51518  2.12062 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3.427083   0.035035  97.819  < 2e-16 ***
## lex_mverb_freq_log      0.319191   0.093319   3.420 0.000679 ***
## mcd                     2.097248   0.430706   4.869 1.53e-06 ***
## n_amod_MI               0.214454   0.054412   3.941 9.33e-05 ***
## usf                    -0.017850   0.005167  -3.455 0.000600 ***
## v_advmod_deltap_strgst  7.301149   1.670503   4.371 1.52e-05 ***
## v_dobj_MI               0.309273   0.070054   4.415 1.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7676 on 473 degrees of freedom
## Multiple R-squared:  0.2607, Adjusted R-squared:  0.2513 
## F-statistic:  27.8 on 6 and 473 DF,  p-value: < 2.2e-16

3.5 Best regression model 2

best_mod_bic2 <- lm(Score ~ mcd + n_amod_MI+ usf + v_advmod_deltap_strgst+v_dobj_MI, data = a3)
summary(best_mod_bic2)
## 
## Call:
## lm(formula = Score ~ mcd + n_amod_MI + usf + v_advmod_deltap_strgst + 
##     v_dobj_MI, data = a3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2453 -0.5272  0.0060  0.5431  2.0018 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3.427083   0.035428  96.733  < 2e-16 ***
## mcd                     1.338130   0.373266   3.585 0.000372 ***
## n_amod_MI               0.205839   0.054964   3.745 0.000203 ***
## usf                    -0.017839   0.005225  -3.414 0.000695 ***
## v_advmod_deltap_strgst  7.681274   1.685509   4.557  6.6e-06 ***
## v_dobj_MI               0.273038   0.070026   3.899 0.000110 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7762 on 474 degrees of freedom
## Multiple R-squared:  0.2424, Adjusted R-squared:  0.2344 
## F-statistic: 30.33 on 5 and 474 DF,  p-value: < 2.2e-16
BIC(best_mod_bic2)
## [1] 1156.138

3.6 Final model information in a table format

tab_model(best_mod_bic2, show.std = T, show.se = T, show.aic = T, col.order = c("est", "se",  "p", "std.est", "std.ci", "ci.inner", "ci.outer", "stat", "df",
  "response.level"), digits = 3)
  Score
Predictors Estimates std. Error p std. Beta standardized CI
(Intercept) 3.427 0.035 <0.001 -0.000 -0.078 – 0.078
mcd 1.338 0.373 <0.001 0.177 0.080 – 0.274
n_amod_MI 0.206 0.055 <0.001 0.154 0.073 – 0.235
usf -0.018 0.005 0.001 -0.165 -0.260 – -0.070
v_advmod_deltap_strgst 7.681 1.686 <0.001 0.187 0.106 – 0.267
v_dobj_MI 0.273 0.070 <0.001 0.159 0.079 – 0.239
Observations 480
R2 / R2 adjusted 0.242 / 0.234
AIC 1126.922

3.7 Calculating importance

calc.relimp(best_mod_bic2)
## Response variable: Score 
## Total response variance: 0.7869476 
## Analysis based on 480 observations 
## 
## 5 Regressors: 
## mcd n_amod_MI usf v_advmod_deltap_strgst v_dobj_MI 
## Proportion of variance explained by model: 24.24%
## Metrics are not normalized (rela=FALSE). 
## 
## Relative importance metrics: 
## 
##                               lmg
## mcd                    0.06727746
## n_amod_MI              0.03833927
## usf                    0.05654548
## v_advmod_deltap_strgst 0.04631796
## v_dobj_MI              0.03392807
## 
## Average coefficients for different model sizes: 
## 
##                                1X        2Xs         3Xs         4Xs
## mcd                     2.7425412  2.3463513  1.98241922  1.64750105
## n_amod_MI               0.3391793  0.2857634  0.24860956  0.22333768
## usf                    -0.0359896 -0.0298037 -0.02483311 -0.02089771
## v_advmod_deltap_strgst 10.7335938  9.3988633  8.52962411  7.98720763
## v_dobj_MI               0.3844653  0.3341693  0.30241797  0.28324401
##                                5Xs
## mcd                     1.33812951
## n_amod_MI               0.20583856
## usf                    -0.01783859
## v_advmod_deltap_strgst  7.68127362
## v_dobj_MI               0.27303775

3.8 Marginal effects plot

plot_model(best_mod_bic2, type = 'pred')
## $mcd

## 
## $n_amod_MI

## 
## $usf

## 
## $v_advmod_deltap_strgst

## 
## $v_dobj_MI

3.9 Checking the assumption

3.9.1 residual plots

plot(best_mod_bic2)

3.9.2 Collinearity diagnosis

1/vif(best_mod_bic2)
##                    mcd              n_amod_MI                    usf 
##              0.6556000              0.9462629              0.6861909 
## v_advmod_deltap_strgst              v_dobj_MI 
##              0.9519354              0.9596180